Previous Assignment¶

In [1]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



Gender = ['Male','Female']
df= pd.DataFrame()
for i in range(0,10000):
    df.loc[i, 'Age']=random.randint(6,70)
    df.loc[i, 'Gender']=random.choice(Gender)
    weekdays=random.randint(1,4)
    df.loc[i,'PC+TV on weekdays']=weekdays
    weekend=random.randint(2,10)
    df.loc[i,'PC+TV on weekends']=weekend
    tst=(5/7)*(weekdays)+(2/7)*(weekend)
    df.loc[i,'ScreenTime']=tst 
    if tst>2.5:
        df.loc[i, 'Screen_time_exposure']='High'
    else:
        df.loc[i, 'Screen_time_exposure']='Low'
    exercise=random.randint(1,3)
    df.loc[i,'Exercise']=exercise      
df1=df.copy()
print("----------------------Initial dataset-------------------------")
print(df1)
npy_array = df['ScreenTime'].to_numpy()
anomaly_indices = np.random.choice(range(15), size=500, replace=True)
npy_array[anomaly_indices] = np.random.exponential(scale=20, size=500)
tempdf= pd.DataFrame(npy_array,columns =['ScreenTime'])
df['ScreenTime']=tempdf['ScreenTime']
print("One of the attributes has a functional relationship with another attribute. This particular attribute also has 1% to 20% random anomalous data.")
print("------------------------------------First Dataset-----------------------------------------------")
print(df)
df.to_csv('First_Dataset.csv')
plt.subplots(figsize=(10,10))
boxplot = df.boxplot(column=['Age','PC+TV on weekdays', 'PC+TV on weekends', 'ScreenTime','Exercise'], grid=False, rot=45, fontsize=12)  
plt.show()
for i in range (0, 10000):
    slpt=9-df1.ScreenTime[i]+df1.Exercise[i]
    df1.loc[i,'Sleeptime']=slpt
    df.loc[i,'Sleeptime']=slpt
    if slpt>= 9.5:
        slpp="High"
        df1.loc[i, "Sleeping_pattern"]= slpp
    elif slpt>=6 and slpt<9.5:
        slpp="Normal"
        df1.loc[i, "Sleeping_pattern"]= slpp
    else:
        slpp="Low" 
        df1.loc[i, "Sleeping_pattern"]= slpp
    df.loc[i,"Sleeping_pattern"]=slpp

print("One of the attributes has a functional relationship with another attribute. One of the attributes has a functional relationship with two other attributes.They do not have any anomalous data")
print("----------------------------------Second Dataset------------------------------------------------------------")
print(df1)
df1.to_csv('Second_Dataset.csv')
plt.subplots(figsize=(10,10))
boxplot = df1.boxplot(column=['Age','PC+TV on weekdays', 'PC+TV on weekends', 'ScreenTime','Exercise','Sleeptime'], grid=False, rot=45, fontsize=12)  
plt.show()
slp_array = df['Sleeptime'].to_numpy()
anomaly_indices = np.random.choice(range(30), size=1000, replace=True)
slp_array[anomaly_indices] = np.random.exponential(scale=30, size=1000)
temp1df= pd.DataFrame(slp_array,columns =['Sleeptime'])
df['Sleeptime']=temp1df['Sleeptime']
print(" One of the attributes has a functional relationship with another attribute. One of the attributes has a functional relationship with two other attributes and alsohas 1% to 20% random anomalous data.")
print("------------------------------------Third Dataset-----------------------------------------------")
print(df)
df.to_csv('Third_Dataset.csv')
plt.subplots(figsize=(10,10))
boxplot = df.boxplot(column=['Age','PC+TV on weekdays', 'PC+TV on weekends', 'ScreenTime','Exercise','Sleeptime'], grid=False, rot=45, fontsize=12)  
plt.show()
----------------------Initial dataset-------------------------
       Age  Gender  PC+TV on weekdays  PC+TV on weekends  ScreenTime  \
0     38.0    Male                2.0                4.0    2.571429   
1     60.0  Female                3.0                4.0    3.285714   
2     45.0    Male                1.0                5.0    2.142857   
3     10.0    Male                3.0                6.0    3.857143   
4     11.0    Male                2.0                9.0    4.000000   
...    ...     ...                ...                ...         ...   
9995  57.0  Female                3.0                4.0    3.285714   
9996  33.0  Female                2.0                7.0    3.428571   
9997  15.0    Male                1.0                6.0    2.428571   
9998  38.0  Female                1.0                8.0    3.000000   
9999  27.0  Female                4.0                8.0    5.142857   

     Screen_time_exposure  Exercise  
0                    High       1.0  
1                    High       1.0  
2                     Low       1.0  
3                    High       1.0  
4                    High       1.0  
...                   ...       ...  
9995                 High       3.0  
9996                 High       3.0  
9997                  Low       3.0  
9998                 High       2.0  
9999                 High       1.0  

[10000 rows x 7 columns]
One of the attributes has a functional relationship with another attribute. This particular attribute also has 1% to 20% random anomalous data.
------------------------------------First Dataset-----------------------------------------------
       Age  Gender  PC+TV on weekdays  PC+TV on weekends  ScreenTime  \
0     38.0    Male                2.0                4.0   12.260531   
1     60.0  Female                3.0                4.0   10.508764   
2     45.0    Male                1.0                5.0   17.951749   
3     10.0    Male                3.0                6.0   25.047673   
4     11.0    Male                2.0                9.0    7.405383   
...    ...     ...                ...                ...         ...   
9995  57.0  Female                3.0                4.0    3.285714   
9996  33.0  Female                2.0                7.0    3.428571   
9997  15.0    Male                1.0                6.0    2.428571   
9998  38.0  Female                1.0                8.0    3.000000   
9999  27.0  Female                4.0                8.0    5.142857   

     Screen_time_exposure  Exercise  
0                    High       1.0  
1                    High       1.0  
2                     Low       1.0  
3                    High       1.0  
4                    High       1.0  
...                   ...       ...  
9995                 High       3.0  
9996                 High       3.0  
9997                  Low       3.0  
9998                 High       2.0  
9999                 High       1.0  

[10000 rows x 7 columns]
One of the attributes has a functional relationship with another attribute. One of the attributes has a functional relationship with two other attributes.They do not have any anomalous data
----------------------------------Second Dataset------------------------------------------------------------
       Age  Gender  PC+TV on weekdays  PC+TV on weekends  ScreenTime  \
0     38.0    Male                2.0                4.0    2.571429   
1     60.0  Female                3.0                4.0    3.285714   
2     45.0    Male                1.0                5.0    2.142857   
3     10.0    Male                3.0                6.0    3.857143   
4     11.0    Male                2.0                9.0    4.000000   
...    ...     ...                ...                ...         ...   
9995  57.0  Female                3.0                4.0    3.285714   
9996  33.0  Female                2.0                7.0    3.428571   
9997  15.0    Male                1.0                6.0    2.428571   
9998  38.0  Female                1.0                8.0    3.000000   
9999  27.0  Female                4.0                8.0    5.142857   

     Screen_time_exposure  Exercise  Sleeptime Sleeping_pattern  
0                    High       1.0   7.428571           Normal  
1                    High       1.0   6.714286           Normal  
2                     Low       1.0   7.857143           Normal  
3                    High       1.0   6.142857           Normal  
4                    High       1.0   6.000000           Normal  
...                   ...       ...        ...              ...  
9995                 High       3.0   8.714286           Normal  
9996                 High       3.0   8.571429           Normal  
9997                  Low       3.0   9.571429             High  
9998                 High       2.0   8.000000           Normal  
9999                 High       1.0   4.857143              Low  

[10000 rows x 9 columns]
 One of the attributes has a functional relationship with another attribute. One of the attributes has a functional relationship with two other attributes and alsohas 1% to 20% random anomalous data.
------------------------------------Third Dataset-----------------------------------------------
       Age  Gender  PC+TV on weekdays  PC+TV on weekends  ScreenTime  \
0     38.0    Male                2.0                4.0   12.260531   
1     60.0  Female                3.0                4.0   10.508764   
2     45.0    Male                1.0                5.0   17.951749   
3     10.0    Male                3.0                6.0   25.047673   
4     11.0    Male                2.0                9.0    7.405383   
...    ...     ...                ...                ...         ...   
9995  57.0  Female                3.0                4.0    3.285714   
9996  33.0  Female                2.0                7.0    3.428571   
9997  15.0    Male                1.0                6.0    2.428571   
9998  38.0  Female                1.0                8.0    3.000000   
9999  27.0  Female                4.0                8.0    5.142857   

     Screen_time_exposure  Exercise  Sleeptime Sleeping_pattern  
0                    High       1.0  11.891582           Normal  
1                    High       1.0  19.631779           Normal  
2                     Low       1.0  59.365894           Normal  
3                    High       1.0   6.889189           Normal  
4                    High       1.0  24.970680           Normal  
...                   ...       ...        ...              ...  
9995                 High       3.0   8.714286           Normal  
9996                 High       3.0   8.571429           Normal  
9997                  Low       3.0   9.571429             High  
9998                 High       2.0   8.000000           Normal  
9999                 High       1.0   4.857143              Low  

[10000 rows x 9 columns]

Requried Libraries¶

In [2]:
# These lines import the necessary libraries and modules for 
# data manipulation (pandas), numerical computations (numpy), plotting (matplotlib.pyplot), a
# nd file and directory operations (glob and os).

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import glob
import os


### Testing For Stationarity
# This line imports the adfuller function from the statsmodels.tsa.stattools module. 
# The adfuller function is used for performing the Augmented Dickey-Fuller test, 
# which is a statistical test for checking the stationarity of a time series. 
from statsmodels.tsa.stattools import adfuller


# Create subplots with one trace per page using go object of plotly.graph_objs module 
#This line imports the go module from the plotly.graph_objs library. 
#The go module provides objects and functions for creating interactive plots and visualization
import plotly.graph_objs as go

# Import TimeSeries class from darts library
# This line imports the TimeSeries class from the darts library.
# The darts library is a time series forecasting and modeling library in Python
from darts import TimeSeries

# These lines import the warnings module and set a filter to ignore warning messages.
# This is done to suppress any non-critical warning messages that may arise during the execution of the code
import warnings
warnings.filterwarnings("ignore")
In [3]:
# Import TimeSeries class from darts library
# This line imports the TimeSeries class from the darts library.
# The darts library is a time series forecasting and modeling library in Pytho
!pip install darts
Requirement already satisfied: darts in c:\users\yasmi\anaconda3\lib\site-packages (0.24.0)
Requirement already satisfied: matplotlib>=3.3.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (3.7.0)
Requirement already satisfied: pytorch-lightning>=1.5.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (2.0.3)
Requirement already satisfied: tensorboardX>=2.1 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (2.6)
Requirement already satisfied: tqdm>=4.60.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (4.64.1)
Requirement already satisfied: shap>=0.40.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (0.41.0)
Requirement already satisfied: pmdarima>=1.8.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (2.0.3)
Requirement already satisfied: pyod>=0.9.5 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.0.9)
Requirement already satisfied: scikit-learn>=1.0.1 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.2.1)
Requirement already satisfied: torch>=1.8.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.12.1)
Requirement already satisfied: numpy>=1.19.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.23.5)
Requirement already satisfied: requests>=2.22.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (2.28.1)
Requirement already satisfied: prophet>=1.1.1 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.1.4)
Requirement already satisfied: joblib>=0.16.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.1.1)
Requirement already satisfied: holidays>=0.11.1 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (0.26)
Requirement already satisfied: scipy>=1.3.2 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.10.0)
Requirement already satisfied: xgboost>=1.6.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.7.5)
Requirement already satisfied: nfoursid>=1.0.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.0.1)
Requirement already satisfied: statsmodels>=0.13.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (0.13.5)
Requirement already satisfied: lightgbm>=3.2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (3.3.5)
Requirement already satisfied: tbats>=1.1.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.1.3)
Requirement already satisfied: statsforecast>=1.4 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.5.0)
Requirement already satisfied: catboost>=1.0.6 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.2)
Requirement already satisfied: pandas>=1.0.5 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (1.5.3)
Requirement already satisfied: xarray>=0.17.0 in c:\users\yasmi\anaconda3\lib\site-packages (from darts) (2022.11.0)
Requirement already satisfied: plotly in c:\users\yasmi\anaconda3\lib\site-packages (from catboost>=1.0.6->darts) (5.9.0)
Requirement already satisfied: six in c:\users\yasmi\anaconda3\lib\site-packages (from catboost>=1.0.6->darts) (1.16.0)
Requirement already satisfied: graphviz in c:\users\yasmi\anaconda3\lib\site-packages (from catboost>=1.0.6->darts) (0.20.1)
Requirement already satisfied: python-dateutil in c:\users\yasmi\anaconda3\lib\site-packages (from holidays>=0.11.1->darts) (2.8.2)
Requirement already satisfied: wheel in c:\users\yasmi\anaconda3\lib\site-packages (from lightgbm>=3.2.0->darts) (0.38.4)
Requirement already satisfied: pillow>=6.2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from matplotlib>=3.3.0->darts) (9.4.0)
Requirement already satisfied: packaging>=20.0 in c:\users\yasmi\anaconda3\lib\site-packages (from matplotlib>=3.3.0->darts) (22.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\yasmi\anaconda3\lib\site-packages (from matplotlib>=3.3.0->darts) (1.4.4)
Requirement already satisfied: cycler>=0.10 in c:\users\yasmi\anaconda3\lib\site-packages (from matplotlib>=3.3.0->darts) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\yasmi\anaconda3\lib\site-packages (from matplotlib>=3.3.0->darts) (4.25.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\yasmi\anaconda3\lib\site-packages (from matplotlib>=3.3.0->darts) (3.0.9)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\yasmi\anaconda3\lib\site-packages (from matplotlib>=3.3.0->darts) (1.0.5)
Requirement already satisfied: pytz>=2020.1 in c:\users\yasmi\anaconda3\lib\site-packages (from pandas>=1.0.5->darts) (2022.7)
Requirement already satisfied: Cython!=0.29.18,!=0.29.31,>=0.29 in c:\users\yasmi\anaconda3\lib\site-packages (from pmdarima>=1.8.0->darts) (0.29.35)
Requirement already satisfied: urllib3 in c:\users\yasmi\anaconda3\lib\site-packages (from pmdarima>=1.8.0->darts) (1.26.14)
Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in c:\users\yasmi\anaconda3\lib\site-packages (from pmdarima>=1.8.0->darts) (65.6.3)
Requirement already satisfied: LunarCalendar>=0.0.9 in c:\users\yasmi\anaconda3\lib\site-packages (from prophet>=1.1.1->darts) (0.0.9)
Requirement already satisfied: importlib-resources in c:\users\yasmi\anaconda3\lib\site-packages (from prophet>=1.1.1->darts) (5.12.0)
Requirement already satisfied: cmdstanpy>=1.0.4 in c:\users\yasmi\anaconda3\lib\site-packages (from prophet>=1.1.1->darts) (1.1.0)
Requirement already satisfied: convertdate>=2.1.2 in c:\users\yasmi\anaconda3\lib\site-packages (from prophet>=1.1.1->darts) (2.4.0)
Requirement already satisfied: numba>=0.51 in c:\users\yasmi\anaconda3\lib\site-packages (from pyod>=0.9.5->darts) (0.56.4)
Requirement already satisfied: torchmetrics>=0.7.0 in c:\users\yasmi\anaconda3\lib\site-packages (from pytorch-lightning>=1.5.0->darts) (0.11.4)
Requirement already satisfied: typing-extensions>=4.0.0 in c:\users\yasmi\anaconda3\lib\site-packages (from pytorch-lightning>=1.5.0->darts) (4.4.0)
Requirement already satisfied: PyYAML>=5.4 in c:\users\yasmi\anaconda3\lib\site-packages (from pytorch-lightning>=1.5.0->darts) (6.0)
Requirement already satisfied: fsspec[http]>2021.06.0 in c:\users\yasmi\anaconda3\lib\site-packages (from pytorch-lightning>=1.5.0->darts) (2022.11.0)
Requirement already satisfied: lightning-utilities>=0.7.0 in c:\users\yasmi\anaconda3\lib\site-packages (from pytorch-lightning>=1.5.0->darts) (0.8.0)
Requirement already satisfied: idna<4,>=2.5 in c:\users\yasmi\anaconda3\lib\site-packages (from requests>=2.22.0->darts) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\yasmi\anaconda3\lib\site-packages (from requests>=2.22.0->darts) (2.0.4)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\yasmi\anaconda3\lib\site-packages (from requests>=2.22.0->darts) (2022.12.7)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\yasmi\anaconda3\lib\site-packages (from scikit-learn>=1.0.1->darts) (2.2.0)
Requirement already satisfied: slicer==0.0.7 in c:\users\yasmi\anaconda3\lib\site-packages (from shap>=0.40.0->darts) (0.0.7)
Requirement already satisfied: cloudpickle in c:\users\yasmi\anaconda3\lib\site-packages (from shap>=0.40.0->darts) (2.0.0)
Requirement already satisfied: plotly-resampler in c:\users\yasmi\anaconda3\lib\site-packages (from statsforecast>=1.4->darts) (0.8.3.2)
Requirement already satisfied: fugue>=0.8.1 in c:\users\yasmi\anaconda3\lib\site-packages (from statsforecast>=1.4->darts) (0.8.5)
Requirement already satisfied: patsy>=0.5.2 in c:\users\yasmi\anaconda3\lib\site-packages (from statsmodels>=0.13.0->darts) (0.5.3)
Requirement already satisfied: protobuf<4,>=3.8.0 in c:\users\yasmi\anaconda3\lib\site-packages (from tensorboardX>=2.1->darts) (3.20.3)
Requirement already satisfied: colorama in c:\users\yasmi\anaconda3\lib\site-packages (from tqdm>=4.60.0->darts) (0.4.6)
Requirement already satisfied: pymeeus<=1,>=0.3.13 in c:\users\yasmi\anaconda3\lib\site-packages (from convertdate>=2.1.2->prophet>=1.1.1->darts) (0.5.12)
Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in c:\users\yasmi\anaconda3\lib\site-packages (from fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (3.8.4)
Requirement already satisfied: adagio>=0.2.4 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (0.2.4)
Requirement already satisfied: jinja2 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (3.1.2)
Requirement already satisfied: fugue-sql-antlr>=0.1.6 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (0.1.6)
Requirement already satisfied: triad>=0.9.0 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (0.9.0)
Requirement already satisfied: sqlglot in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (16.1.0)
Requirement already satisfied: qpd>=0.4.3 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (0.4.3)
Requirement already satisfied: pyarrow>=0.15.1 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue>=0.8.1->statsforecast>=1.4->darts) (12.0.1)
Requirement already satisfied: ephem>=3.7.5.3 in c:\users\yasmi\anaconda3\lib\site-packages (from LunarCalendar>=0.0.9->prophet>=1.1.1->darts) (4.1.4)
Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in c:\users\yasmi\anaconda3\lib\site-packages (from numba>=0.51->pyod>=0.9.5->darts) (0.39.1)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from plotly->catboost>=1.0.6->darts) (8.0.1)
Requirement already satisfied: trace-updater>=0.0.8 in c:\users\yasmi\anaconda3\lib\site-packages (from plotly-resampler->statsforecast>=1.4->darts) (0.0.9.1)
Requirement already satisfied: dash<3.0.0,>=2.2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from plotly-resampler->statsforecast>=1.4->darts) (2.10.2)
Requirement already satisfied: orjson<4.0.0,>=3.8.0 in c:\users\yasmi\anaconda3\lib\site-packages (from plotly-resampler->statsforecast>=1.4->darts) (3.9.1)
Requirement already satisfied: jupyter-dash>=0.4.2 in c:\users\yasmi\anaconda3\lib\site-packages (from plotly-resampler->statsforecast>=1.4->darts) (0.4.2)
Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in c:\users\yasmi\anaconda3\lib\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (4.0.2)
Requirement already satisfied: yarl<2.0,>=1.0 in c:\users\yasmi\anaconda3\lib\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (1.9.2)
Requirement already satisfied: attrs>=17.3.0 in c:\users\yasmi\anaconda3\lib\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (22.1.0)
Requirement already satisfied: frozenlist>=1.1.1 in c:\users\yasmi\anaconda3\lib\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (1.3.3)
Requirement already satisfied: aiosignal>=1.1.2 in c:\users\yasmi\anaconda3\lib\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (1.3.1)
Requirement already satisfied: multidict<7.0,>=4.5 in c:\users\yasmi\anaconda3\lib\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>2021.06.0->pytorch-lightning>=1.5.0->darts) (6.0.4)
Requirement already satisfied: dash-core-components==2.0.0 in c:\users\yasmi\anaconda3\lib\site-packages (from dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (2.0.0)
Requirement already satisfied: Flask<2.3.0,>=1.0.4 in c:\users\yasmi\anaconda3\lib\site-packages (from dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (2.2.2)
Requirement already satisfied: dash-table==5.0.0 in c:\users\yasmi\anaconda3\lib\site-packages (from dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (5.0.0)
Requirement already satisfied: Werkzeug<2.3.0 in c:\users\yasmi\anaconda3\lib\site-packages (from dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (2.2.2)
Requirement already satisfied: dash-html-components==2.0.0 in c:\users\yasmi\anaconda3\lib\site-packages (from dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (2.0.0)
Requirement already satisfied: antlr4-python3-runtime<4.12,>=4.11.1 in c:\users\yasmi\anaconda3\lib\site-packages (from fugue-sql-antlr>=0.1.6->fugue>=0.8.1->statsforecast>=1.4->darts) (4.11.1)
Requirement already satisfied: ipykernel in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (6.19.2)
Requirement already satisfied: nest-asyncio in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (1.5.6)
Requirement already satisfied: ansi2html in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (1.8.0)
Requirement already satisfied: retrying in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (1.3.4)
Requirement already satisfied: ipython in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (8.10.0)
Requirement already satisfied: fs in c:\users\yasmi\anaconda3\lib\site-packages (from triad>=0.9.0->fugue>=0.8.1->statsforecast>=1.4->darts) (2.4.16)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jinja2->fugue>=0.8.1->statsforecast>=1.4->darts) (2.1.1)
Requirement already satisfied: click>=8.0 in c:\users\yasmi\anaconda3\lib\site-packages (from Flask<2.3.0,>=1.0.4->dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (8.0.4)
Requirement already satisfied: itsdangerous>=2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from Flask<2.3.0,>=1.0.4->dash<3.0.0,>=2.2.0->plotly-resampler->statsforecast>=1.4->darts) (2.0.1)
Requirement already satisfied: appdirs~=1.4.3 in c:\users\yasmi\anaconda3\lib\site-packages (from fs->triad>=0.9.0->fugue>=0.8.1->statsforecast>=1.4->darts) (1.4.4)
Requirement already satisfied: psutil in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (5.9.0)
Requirement already satisfied: debugpy>=1.0 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (1.5.1)
Requirement already satisfied: tornado>=6.1 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (6.1)
Requirement already satisfied: comm>=0.1.1 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.1.2)
Requirement already satisfied: pyzmq>=17 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (23.2.0)
Requirement already satisfied: matplotlib-inline>=0.1 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.1.6)
Requirement already satisfied: traitlets>=5.4.0 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (5.7.1)
Requirement already satisfied: jupyter-client>=6.1.12 in c:\users\yasmi\anaconda3\lib\site-packages (from ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (7.3.4)
Requirement already satisfied: backcall in c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.2.0)
Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.30 in c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (3.0.36)
Requirement already satisfied: pickleshare in c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.7.5)
Requirement already satisfied: decorator in c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (5.1.1)
Requirement already satisfied: jedi>=0.16 in c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.18.1)
Requirement already satisfied: stack-data in c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.2.0)
Requirement already satisfied: pygments>=2.4.0 in c:\users\yasmi\anaconda3\lib\site-packages (from ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (2.11.2)
Requirement already satisfied: parso<0.9.0,>=0.8.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jedi>=0.16->ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.8.3)
Requirement already satisfied: jupyter-core>=4.9.2 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-client>=6.1.12->ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (5.2.0)
Requirement already satisfied: entrypoints in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-client>=6.1.12->ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.4)
Requirement already satisfied: wcwidth in c:\users\yasmi\anaconda3\lib\site-packages (from prompt-toolkit<3.1.0,>=3.0.30->ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.2.5)
Requirement already satisfied: executing in c:\users\yasmi\anaconda3\lib\site-packages (from stack-data->ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.8.3)
Requirement already satisfied: asttokens in c:\users\yasmi\anaconda3\lib\site-packages (from stack-data->ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (2.0.5)
Requirement already satisfied: pure-eval in c:\users\yasmi\anaconda3\lib\site-packages (from stack-data->ipython->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (0.2.2)
Requirement already satisfied: pywin32>=1.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-core>=4.9.2->jupyter-client>=6.1.12->ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (305.1)
Requirement already satisfied: platformdirs>=2.5 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-core>=4.9.2->jupyter-client>=6.1.12->ipykernel->jupyter-dash>=0.4.2->plotly-resampler->statsforecast>=1.4->darts) (2.5.2)

Helping Visualization functions¶

In [4]:
def plot(df,x_feature_name,y_feature_name,title):
    """
    This function takes two dataframes as input and plots the number of calls per day and per week.

    Args:
    daily_df (pandas.DataFrame): A dataframe containing daily call data.
    weekly_df (pandas.DataFrame): A dataframe containing weekly call data.

    Returns:
    None
    """

    # Create a subplot with two rows and one column
    # fig = make_subplots(rows=2, cols=1)
    fig = go.Figure()
    # Add a trace for daily calls
    fig.add_trace(
        go.Scatter(
            x=df[x_feature_name],
            y=df[y_feature_name],
            name=y_feature_name,
            mode='lines+markers'
        ))

 

    # Update xaxis properties
    fig.update_xaxes(title_text='Date')

    # Update yaxis properties
    fig.update_yaxes(title_text=y_feature_name)

    # Update title and height
    fig.update_layout(
        title=f'{title}',
        height=500,
        width=1200
    )

    # Show the plot
    fig.show()

    # Write the plot to an HTML file
    # fig.write_html(f'Visualization/btc.html')


def train_test_predicted_plot(df_train,
                              df_test,
                              x_feature ,
                              y_feature,
                              predicted, 
                              model_name):
    """
    Plots the training data, actual values, and forecasted values using Plotly.

    Args:
        train (pd.Series): The training data.
        test (pd.Series): The actual values.
        predicted (pd.Series): The forecasted values.
        model_name (str): The name of the forecasting model.

    Returns:
        None
    """
  

   
    # Create a subplot with two rows and one column
    fig = go.Figure()

    fig.add_trace(
    go.Scatter(
        x=df_train[x_feature],
        y=df_train[y_feature],
        name='Training Data',
        mode='lines+markers'
    ))

    # Add a trace for actual values
    fig.add_trace(
        go.Scatter(
            x=df_test[x_feature],
            y=df_test[y_feature],
            name='Actual Values',
            mode='lines+markers'
        )
    )

    # Add a trace for forecasted values
    fig.add_trace(
        go.Scatter(
            x=predicted[x_feature],
            y=predicted[y_feature],
            name=f'{model_name}',
            mode='lines+markers'
        )
    )

    # Update xaxis properties
    fig.update_xaxes(title_text='Time')

    # Update yaxis properties
    fig.update_yaxes(title_text=y_feature)

    # Update title and height
    fig.update_layout(
        title=f'Forecasting using {model_name}',
        height=500,
        width=1500
    )

    # Save the plot as an HTML file
    fig.show()
    # fig.write_html(f'Visualization/forecasting_using_{model_name}'+'.html')
    # fig.write_imag

Data Analysis¶

In [5]:
df = pd.read_csv('dataset.csv')
df['date'] = pd.to_datetime(df['date'])
display(df)
Unnamed: 0.1 Unnamed: 0 date sleep_hours
0 0 0 2015-02-19 6.400000
1 1 1 2015-02-20 7.583333
2 2 2 2015-02-21 6.350000
3 3 3 2015-02-22 6.500000
4 4 4 2015-02-23 8.916667
... ... ... ... ...
2349 2349 2349 2021-12-25 7.933333
2350 2350 2350 2021-12-26 3.850000
2351 2351 2351 2021-12-29 6.175000
2352 2352 2352 2021-12-30 5.158333
2353 2353 2353 2021-12-31 5.908333

2354 rows × 4 columns

Finding outliers¶

In [6]:
df.to_csv('dataset.csv')
plt.subplots(figsize=(10,10))
boxplot1 = df.boxplot(column=['sleep_hours'], grid=False, rot=45, fontsize=12)  
plt.show()

Handling outliers¶

In [7]:
numerical_cols = ['sleep_hours']
# Calculate z-scores for each numerical column
z_scores = np.abs((df[numerical_cols] - df[numerical_cols].mean()) / df[numerical_cols].std())
# Set the threshold for outlier detection (e.g., z-score > 3)
outlier_threshold = 14
# Identify rows with outliers
outlier_rows = z_scores.apply(lambda row: any(row > outlier_threshold), axis=1)
# Remove rows with outliers from the DataFrame
df_cleaned = df[~outlier_rows]
# Save the cleaned DataFrame to a new file
df.to_csv('cleaned_dataset.csv', index=False)
print(df)
      Unnamed: 0.1  Unnamed: 0       date  sleep_hours
0                0           0 2015-02-19     6.400000
1                1           1 2015-02-20     7.583333
2                2           2 2015-02-21     6.350000
3                3           3 2015-02-22     6.500000
4                4           4 2015-02-23     8.916667
...            ...         ...        ...          ...
2349          2349        2349 2021-12-25     7.933333
2350          2350        2350 2021-12-26     3.850000
2351          2351        2351 2021-12-29     6.175000
2352          2352        2352 2021-12-30     5.158333
2353          2353        2353 2021-12-31     5.908333

[2354 rows x 4 columns]
In [43]:
# This line prints the shape of the DataFrame df using the shape attribute. 
# The shape attribute returns a tuple representing the dimensions of the DataFrame, 
# with the number of rows and columns
df = pd.read_csv('cleaned_dataset.csv')
print('Shape of the Data ',df.shape)
print('\n')
# These lines print the statistics report of the training data stored in the DataFrame df. 
# The describe() function calculates various summary statistics of the numerical columns in the DataFrame,
# such as count, mean, standard deviation, minimum value, and quartiles. The comment is incomplete
print('Statistics Report of Data')
print(df.describe())
Shape of the Data  (2354, 3)


Statistics Report of Data
        Unnamed: 0  sleep_hours
count  2354.000000  2354.000000
mean   1176.500000     7.356560
std     679.685589     2.213308
min       0.000000     1.266667
25%     588.250000     6.235417
50%    1176.500000     6.816667
75%    1764.750000     7.483333
max    2353.000000    17.433333

Missing Data/Days¶

In [8]:
# This line converts the 'date' column in the DataFrame df to datetime 
# format using the pd.to_datetime() function. 
# This is done to ensure that the 'date' column is recognized 
# and processed as dates for further analysis
df['date'] = pd.to_datetime(df['date'])

# This line creates a new pandas DatetimeIndex object called complete_dates using the pd.date_range() function.
# It generates a range of dates starting from the minimum date in the 'date' column of the DataFrame
# df to the maximum date, with a frequency of one day (freq='D'). 
# This will be used to create a complete sequence of dates
complete_dates = pd.date_range(start=df['date'].min() ,end=df['date'].max(),freq='D' )
# This line creates a new DataFrame called completed_dates_df with a single column named 'date' using the pd.DataFrame() function.
completed_dates_df = pd.DataFrame({'date':complete_dates})

# This line merges the completed_dates_df DataFrame with the original DataFrame df based on the 'date' column.
# It performs a left join (how='left'), which means that all the dates from completed_dates_df are included,
# and the corresponding data from df is merged where available.
merged_df = pd.merge(completed_dates_df,df,on='date',how='left')

# This line creates a new DataFrame called missing_days by filtering the merged_df_train DataFrame. 
# It selects only the rows where the 'sleep_hours' column has missing values (NaN)
missing_days = merged_df[merged_df['sleep_hours'].isnull()]

print('Missing Values in days:\n',missing_days.shape[0])

print('Missing Day or Index')
display(missing_days)
Missing Values in days:
 154
Missing Day or Index
date Unnamed: 0.1 Unnamed: 0 sleep_hours
14 2015-03-05 NaN NaN NaN
15 2015-03-06 NaN NaN NaN
16 2015-03-07 NaN NaN NaN
18 2015-03-09 NaN NaN NaN
22 2015-03-13 NaN NaN NaN
... ... ... ... ...
2390 2021-09-05 NaN NaN NaN
2399 2021-09-14 NaN NaN NaN
2469 2021-11-23 NaN NaN NaN
2503 2021-12-27 NaN NaN NaN
2504 2021-12-28 NaN NaN NaN

154 rows × 4 columns

In [6]:
# Visualize the missing days
# This line creates a new figure for the plot with a specified size of 20 units in width and 4 units in height
plt.figure(figsize=(20, 4))
# This line plots the data on the created figure. It uses the plot() function from matplotlib.pyplot to 
# plot the availability of sleep data (merged_df_train['sleep_hours'].notnull()) 
# against the index of the merged_df_train DataFrame (merged_df_train.index). 
# The markers are set to 'o' (circle), the linestyle is set to '-' (solid line), 
# and the linewidth is set to 0.5. This line essentially visualizes the availability of sleep data for each day
plt.plot(merged_df.index, merged_df['sleep_hours'].notnull(), marker='o', linestyle='-', linewidth=0.5)

# These lines add a title to the plot as "Missing Days", set the label for the x-axis as "Date", 
# set the label for the y-axis as "Availability", and enable grid lines on the plot
plt.title('Missing Days')
plt.xlabel('Date')
plt.ylabel('Availability')
plt.grid(True)

# Show the plot
plt.show()

# Summary :
    # The code visualizes the availability of sleep data for each day by plotting a graph. It uses the plot() function from matplotlib.pyplot
    # to create a line plot where the x-axis represents the dates and the y-axis represents the availability of sleep data. 
    # The plot shows markers for the presence or absence of sleep data on each day. 
    # The resulting visualization provides a quick overview of the missing days where sleep data is not available

Filling Missing Day using interpolation¶

In [9]:
# Fill missing values using linear interpolation
# This line fills the missing values in the 'sleep_hours' column of the DataFrame merged_df using linear interpolation.
# The interpolate() function is applied to the 'sleep_hours' column, and the method='nearest' parameter specifies that 
# the nearest non-null value should be used to fill the missing values
merged_df['sleep_hours'] = merged_df['sleep_hours'].interpolate(method='linear')

# Calculate the number of missing days
# These lines calculate the number of missing days in the 'sleep_hours' column. The isnull() function is used to 
# identify the missing values, and sum() calculates the total count of missing values. The result is stored in
# the variable missing_days, and then printed with an accompanying message.
missing_days = merged_df['sleep_hours'].isnull().sum()
print('Number of missing days:', missing_days)

# Visualize the filled data
plt.figure(figsize=(25, 4))
plt.plot(merged_df['date'], merged_df['sleep_hours'], marker='o', linestyle='-', linewidth=2)
plt.title('Sleep Hours by Date (Interpolated)')
plt.xlabel('Date')
plt.ylabel('Sleep Hours')
plt.grid(True)

# Show the plot
plt.show()

# Summary of the code:
# The code fills the missing values in the 'sleep_hours' column of the DataFrame using linear interpolation.
# It then calculates the number of missing days and prints the count. After that, it visualizes the filled data 
# by creating a line plot with dates on the x-axis and sleep hours on the y-axis. The plot shows the interpolated 
# values and provides an overview of the sleep hours by date.
Number of missing days: 0

Data Distribution¶

In [10]:
# This line imports the Plotly Express module, which provides a high-level interface for creating interactive plots.
import plotly.express as px

# This line creates a histogram figure using the px.histogram() function from Plotly Express. 
# It takes the DataFrame merged_df as input and specifies that the 'sleep_hours' column should be used as the x-axis variable. 
# The title parameter sets the title of the plot to 'Distribution of the Training Data', and nbins=10 specifies the number of bins in the histogram
fig = px.histogram(merged_df, x='sleep_hours', title='Distribution of the Training Data', nbins=10)
# This line updates the visual properties of the histogram bars. It sets the marker_color to '#636EFA' 
# (a shade of blue), marker_line_color to 'white', and marker_line_width to 0.5. This customization gives the bars a consistent appearance.
fig.update_traces(marker_color='#636EFA', marker_line_color='white', marker_line_width=0.5)
# This block of code updates the layout of the figure. It sets the x-axis title to 'Number of Hours', 
# the y-axis title to 'Frequency', removes the legend, sets the plot background color to '#f2f2f2', and updates 
# the font settings to use the Arial font, size 12, and color '#333333' for the text
fig.update_layout(
    xaxis_title='Number of Hours',
    yaxis_title='Frequency',
    showlegend=False,
    plot_bgcolor='#f2f2f2',
    font=dict(
        family='Arial',
        size=12,
        color='#333333'
    )
)
# These lines update the grid properties of the x-axis and y-axis, respectively. 
# They set showgrid to True to display the grid lines, gridwidth to 0.5 to control 
# the thickness of the grid lines, and gridcolor to 'lightgray' to set the color of the grid lines
fig.update_xaxes(showgrid=True, gridwidth=0.5, gridcolor='lightgray')
fig.update_yaxes(showgrid=True, gridwidth=0.5, gridcolor='lightgray')

fig.show()

Data¶

In [11]:
import plotly.graph_objects as go

# Creating the scatter plot
# This line creates an empty figure object using the go.Figure() constructor. The figure object will be used to add traces and customize the plot
fig = go.Figure()

# Adding scatter trace
# This line adds a scatter trace to the figure. It uses the go.Scatter() constructor from Plotly graph objects.
# The x and y parameters specify the data for the x-axis and y-axis, respectively. The mode parameter is set to 
# 'markers' to display individual data points. The name parameter sets the name of the trace, which will be used in the legend.
fig.add_trace(go.Scatter(x=merged_df['date'], y=merged_df['sleep_hours'], mode='markers', name='Sleep Hours'))

# Adding line trace
# This line adds a second scatter trace to the figure. It has the same data as the previous trace but the mode parameter is set to 'lines' to create a line plot instead of markers. This trace represents the trend of sleep hours over time
fig.add_trace(go.Scatter(x=merged_df['date'], y=merged_df['sleep_hours'], mode='lines', name='Sleep Hours Trend'))

# Customizing the axes labels
# This line updates the layout of the figure by setting the x-axis title to 'Date' and the y-axis title to 'Sleep Hours'. It customizes the axes labels.
fig.update_layout(xaxis_title='Date', yaxis_title='Sleep Hours')

# Customizing the grid lines
# These lines further customize the layout by adding grid lines to the x-axis and y-axis. 
# They set showgrid to True to display the grid lines, gridwidth to 0.5 to control the 
# thickness of the grid lines, and gridcolor to 'lightgray' to set the color of the grid lines
fig.update_layout(xaxis=dict(showgrid=True, gridwidth=0.5, gridcolor='lightgray'),
                  yaxis=dict(showgrid=True, gridwidth=0.5, gridcolor='lightgray'))

# Setting the title
fig.update_layout(title='Data : Sleep Hours by Date')

# Displaying the plot
fig.show()


# Summary :
# The code uses Plotly graph objects to create a scatter plot of the sleep hours by date. 
# It adds two scatter traces, one with markers representing individual sleep hour values
# and another with a line representing the trend. It customizes the axes labels, adds grid lines, sets the title, 
# and displays the plot. The final plot will show the sleep hours data with markers and a trend line,
# along with customized labels, grid lines, and title for a visually appealing and informative visualization

Training Data on different Sampling Freq¶

In [12]:
df_indexed = merged_df.set_index('date')
# These lines perform resampling on the 'sleep_hours' column of the df_indexed DataFrame.
# They aggregate the data into 36-hour and 48-hour intervals and calculate the sum of 'sleep_hours' within each interval. 
# The resulting resampled data is stored in the df_36_hourly and df_48_hourly 
df_36_hourly = df_indexed['sleep_hours'].resample('36h').sum().reset_index()
df_48_hourly = df_indexed['sleep_hours'].resample('48h').sum().reset_index()

fig = go.Figure()

# Add the 12-hourly line plot
fig.add_trace(go.Scatter(x=df_36_hourly['date'], y=df_36_hourly['sleep_hours'], mode='lines', name='36-Hourly'))
fig.add_trace(go.Scatter(x=df_36_hourly['date'], y=df_36_hourly['sleep_hours'], mode='markers', name='36-Hourly'))



fig.add_trace(go.Scatter(x=df_48_hourly['date'], y=df_48_hourly['sleep_hours'], mode='lines', name='48-Hourly'))
fig.add_trace(go.Scatter(x=df_48_hourly['date'], y=df_48_hourly['sleep_hours'], mode='markers', name='48-Hourly'))

# Customize the axes labels
fig.update_layout(xaxis_title='Date', yaxis_title='Sleep Hours')

# Customize the grid lines
fig.update_layout(xaxis=dict(showgrid=True, gridwidth=0.5, gridcolor='lightgray'),
                  yaxis=dict(showgrid=True, gridwidth=0.5, gridcolor='lightgray'))

# Set the title
fig.update_layout(title='Sleep Hours by Date')

# Display the plot
fig.show()

# summary:
# The code uses Plotly graph objects to create a plot showing the sleep hours by date at 36-hour and 48-hour intervals. 
# It adds scatter traces for both the line plot and the individual data points for each interval. 
# The axes labels, grid lines, and title are customized. The final plot will show the sleep hours
# at different intervals, with line plots and markers, along with customized labels, grid lines, 
# and title for an informative visualization of the sleep hour trends over time.

Modeling¶

Box-Jenkins Framework

  • The Box-Jenkins method is a statistical technique used for time series analysis and forecasting. The approach starts with the assumption that the process that generated the time series can be approximated using an ARMA model if it is stationary or an ARIMA model if it is non-stationary.

  • The Box-Jenkins method applies autoregressive moving average (ARMA) or autoregressive integrated moving average (ARIMA) models to find the best fit of a time-series model to past values of a time series2. The model can analyze several different types of time series data for forecasting purposes3.

ARIMA Model Pipeline

  • Autoregressive Integrated Moving Average (ARIMA) Model
    • autoregressive models: AR(p)
    • moving average models: MA(q)
    • mixed autoregressive moving average models: ARMA(p, q)
    • integration models: ARIMA(p, d, q)

Stationary Test

What is stationary Data ?

Stationary data refers to time series data that mean and variance do not vary across time. The data is considered non-stationary if there is a strong trend or seasonality observed from the data.

Why we need stationary Data for ARIMA Model ARIMA models rely on the assumption that the time series being modeled is stationary. Therefore that assumption needs to hold if you want to use these models. The ARIMA model uses differenced data to make the data stationary, which means there’s a consistency of the data over time. This function removes the effect of trends or seasonality, such as market or economic data. We make the data stationary only in case of ARIMA because the ARIMA model looks at the past data to predict future values.

In [13]:
def adfuller_test(values):
    result=adfuller(values)
    labels = ['ADF Test Statistic','p-value','#Lags Used','Number of Observations Used']
    for value,label in zip(result,labels):
        print(label+' : '+str(value) )
    if result[1] <= 0.05:
        print("P value is less than 0.05 that means we can reject the null hypothesis(Ho). Therefore we can conclude that data has no unit root and is stationary")
    else:
        print("Weak evidence against null hypothesis that means time series has a unit root which indicates that it is non-stationary ")

adfuller_test(merged_df['sleep_hours'])
ADF Test Statistic : -2.411101199370657
p-value : 0.1386276602953715
#Lags Used : 22
Number of Observations Used : 2485
Weak evidence against null hypothesis that means time series has a unit root which indicates that it is non-stationary 

Data Pipeline¶

In [14]:
time_series_daily = TimeSeries.from_dataframe(merged_df,'date','sleep_hours',freq='D' ,fill_missing_dates=True,fillna_value=True)
train , test = time_series_daily.split_after(0.80)
print('Shape of train set : ',train.pd_dataframe().shape)
print('Shape of test set : ',test.pd_dataframe().shape)
Horizan = test.pd_dataframe().shape[0]
Shape of train set :  (2006, 1)
Shape of test set :  (502, 1)

Inspecting Seasonality¶

In [15]:
from darts.utils.statistics import plot_acf, check_seasonality

for m in range(2, 25):
    is_seasonal, period = check_seasonality(time_series_daily, m=m, alpha=0.05)
    if is_seasonal:
        print("There is seasonality of order {}.".format(period))
There is seasonality of order 10.
There is seasonality of order 14.
There is seasonality of order 17.
There is seasonality of order 20.

Auto Correlation plot¶

The autocorrelation function (ACF) is used to identify the order of ARIMA models. The ACF plot shows the correlation between the time series and its lagged version. The lag at which the ACF plot crosses the upper confidence interval for the first time is considered as the order of the MA component of the ARIMA model. Similarly, if the ACF plot decays slowly, it indicates that there is a high degree of autocorrelation in the time series, which means that an AR component should be included in the ARIMA model.

In [16]:
from darts.utils.statistics import plot_acf,plot_pacf

plot_acf(time_series_daily.diff(1), m=12, max_lag=100,  fig_size=(10, 5), axis=None, default_formatting=True)
plt.xlabel('lags')
plt.ylabel('correlation')
plt.title('Auto Correlation Plot')
plt.show()

Partial Auto Correlation plot¶

The partial autocorrelation function (PACF) is also used to identify the order of ARIMA models. The PACF plot shows the correlation between the time series and its lagged version, but with the influence of the intermediate lags removed. The lag at which the PACF plot crosses the upper confidence interval for the first time is considered as the order of the AR component of the ARIMA model.

In [17]:
from darts.utils.statistics import plot_acf,plot_pacf
plot_pacf(time_series_daily, m=7, max_lag=100,  fig_size=(10, 5), axis=None, default_formatting=True)

plt.xlabel('lags')
plt.ylabel('correlation')
plt.title('Partial Auto Correlation Plot')
plt.show()

Model Fitting¶

In [18]:
from darts.models.forecasting.arima import ARIMA
# these parameters has been found using correlation plot and partial correlation plots
# i added the description there please read that paragraph # and we can also test 
# other order by try and test method
arima_model =  ARIMA(p=2 , #, for Auto regressive parameter
                     d=1  , # for difference to make the data is statioanry
                     q=3 ,  # for the moving Average,
                     seasonal_order=(3, 1, 3, 7)
                     )

arima_model.fit(train)
Out[18]:
ARIMA(p=2, d=1, q=3, seasonal_order=(3, 1, 3, 7), trend=None, random_state=0, add_encoders=None)
In [19]:
predictions = arima_model.predict(Horizan)
display(predictions)
<TimeSeries (DataArray) (date: 502, component: 1, sample: 1)>
array([[[5.9193949 ]],

       [[5.88743747]],

       [[6.1192225 ]],

       [[6.30209793]],

       [[6.3121213 ]],

       [[6.39180562]],

       [[6.32902015]],

       [[6.0761223 ]],

       [[6.14630629]],

       [[6.0773535 ]],

...

       [[5.93558361]],

       [[6.11248137]],

       [[6.07352592]],

       [[6.3344708 ]],

       [[6.21043021]],

       [[5.79617676]],

       [[5.88247547]],

       [[5.94634898]],

       [[6.08371473]],

       [[6.10547089]]])
Coordinates:
  * date       (date) datetime64[ns] 2020-08-17 2020-08-18 ... 2021-12-31
  * component  (component) object 'sleep_hours'
Dimensions without coordinates: sample
Attributes:
    static_covariates:  None
    hierarchy:          None
TimeSeries (DataArray)
  • date: 502
  • component: 1
  • sample: 1
  • 5.919 5.887 6.119 6.302 6.312 6.392 ... 5.796 5.882 5.946 6.084 6.105
    array([[[5.9193949 ]],
    
           [[5.88743747]],
    
           [[6.1192225 ]],
    
           [[6.30209793]],
    
           [[6.3121213 ]],
    
           [[6.39180562]],
    
           [[6.32902015]],
    
           [[6.0761223 ]],
    
           [[6.14630629]],
    
           [[6.0773535 ]],
    
    ...
    
           [[5.93558361]],
    
           [[6.11248137]],
    
           [[6.07352592]],
    
           [[6.3344708 ]],
    
           [[6.21043021]],
    
           [[5.79617676]],
    
           [[5.88247547]],
    
           [[5.94634898]],
    
           [[6.08371473]],
    
           [[6.10547089]]])
    • date
      (date)
      datetime64[ns]
      2020-08-17 ... 2021-12-31
      array(['2020-08-17T00:00:00.000000000', '2020-08-18T00:00:00.000000000',
             '2020-08-19T00:00:00.000000000', ..., '2021-12-29T00:00:00.000000000',
             '2021-12-30T00:00:00.000000000', '2021-12-31T00:00:00.000000000'],
            dtype='datetime64[ns]')
    • component
      (component)
      object
      'sleep_hours'
      array(['sleep_hours'], dtype=object)
    • date
      PandasIndex
      PandasIndex(DatetimeIndex(['2020-08-17', '2020-08-18', '2020-08-19', '2020-08-20',
                     '2020-08-21', '2020-08-22', '2020-08-23', '2020-08-24',
                     '2020-08-25', '2020-08-26',
                     ...
                     '2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
                     '2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
                     '2021-12-30', '2021-12-31'],
                    dtype='datetime64[ns]', name='date', length=502, freq='D'))
    • component
      PandasIndex
      PandasIndex(Index(['sleep_hours'], dtype='object', name='component'))
  • static_covariates :
    None
    hierarchy :
    None
In [20]:
# Convert train_series into a pandas dataframe and reset index
df_train = train.pd_dataframe().reset_index()

# Convert test_series into a pandas dataframe and reset index
df_test = test.pd_dataframe().reset_index()

# Convert prediction into a pandas dataframe and reset index
forecast = predictions.pd_dataframe().reset_index()
x_feature ='date' 
y_feature='sleep_hours'
model_name = 'Arima Prediction'
train_test_predicted_plot(df_train,df_test,x_feature,y_feature,forecast,'ARIMA-Prediction')

Evaluation Metrics¶

Suppose you have the following true and predicted time series data:

y_true = [1, 2, 3, 4, 5] y_pred = [1.2, 2.3, 3.4, 4.5, 5.6] To calculate the mean squared error (MSE) between y_true and y_pred, you would use the following formula:

  • MSE = (1/n) * sum((y_true_i - y_pred_i)^2) where n is the number of data points in the time series.

In this case, the MSE would be:

  • MSE = (1/5) * ((1-1.2)^2 + (2-2.3)^2 + (3-3.4)^2 + (4-4.5)^2 + (5-5.6)^2) = 0.26 To calculate the mean absolute percentage error (MAPE) between y_true and y_pred, you would use the following formula:

  • MAPE = (1/n) * sum(abs((y_true_i - y_pred_i)/y_true_i)) In this case, the MAPE would be:

MAPE = (1/5) * (abs((1-1.2)/1) + abs((2-2.3)/2) + abs((3-3.4)/3) + abs((4-4.5)/4) + abs((5-5.6)/5)) = 0.083 The MSE measures the average squared difference between the predicted and true values in a time series. It is a measure of how well your model fits the data.

The MAPE measures the average percentage difference between the predicted and true values in a time series. It is a measure of how well your model predicts future values.

In [21]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error  
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np

def Evaluations_metrics(y_true,y_pred):
        
    # y_true and y_pred are your true and predicted time series data
    mse_value = mean_squared_error(y_true, y_pred)
    mape_value = mean_absolute_percentage_error(y_true, y_pred)*100
    mae_value = mean_absolute_error(y_true, y_pred)
    rmse_value = np.sqrt(mean_squared_error(y_true, y_pred))
    
    print('Mean Sqaured Error(MSE) : ',mse_value)
    print('Mean absolute Percentage Error (MAPE)(percentage Error) : ',mape_value)
    print('Mean Absolute Error  : ',mae_value)
    print('Root Mean Sqaure Error :',rmse_value)
    
    
    return mse_value , mape_value ,mae_value ,rmse_value

mse_value , mape_value ,mae_value ,rmse_value = Evaluations_metrics(df_test['sleep_hours'].tolist() ,forecast['sleep_hours'].tolist() )
Mean Sqaured Error(MSE) :  1.2709446975173189
Mean absolute Percentage Error (MAPE)(percentage Error) :  13.31269939637065
Mean Absolute Error  :  0.8885399983122241
Root Mean Sqaure Error : 1.1273618307878437
In [22]:
!pip install nbconvert
Requirement already satisfied: nbconvert in c:\users\yasmi\anaconda3\lib\site-packages (6.5.4)
Requirement already satisfied: traitlets>=5.0 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (5.7.1)
Requirement already satisfied: nbclient>=0.5.0 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (0.5.13)
Requirement already satisfied: defusedxml in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (0.7.1)
Requirement already satisfied: pandocfilters>=1.4.1 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (1.5.0)
Requirement already satisfied: mistune<2,>=0.8.1 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (0.8.4)
Requirement already satisfied: pygments>=2.4.1 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (2.11.2)
Requirement already satisfied: nbformat>=5.1 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (5.7.0)
Requirement already satisfied: packaging in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (22.0)
Requirement already satisfied: lxml in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (4.9.1)
Requirement already satisfied: jupyter-core>=4.7 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (5.2.0)
Requirement already satisfied: tinycss2 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (1.2.1)
Requirement already satisfied: entrypoints>=0.2.2 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (0.4)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (2.1.1)
Requirement already satisfied: jupyterlab-pygments in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (0.1.2)
Requirement already satisfied: beautifulsoup4 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (4.11.1)
Requirement already satisfied: jinja2>=3.0 in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (3.1.2)
Requirement already satisfied: bleach in c:\users\yasmi\anaconda3\lib\site-packages (from nbconvert) (4.1.0)
Requirement already satisfied: platformdirs>=2.5 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-core>=4.7->nbconvert) (2.5.2)
Requirement already satisfied: pywin32>=1.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-core>=4.7->nbconvert) (305.1)
Requirement already satisfied: jupyter-client>=6.1.5 in c:\users\yasmi\anaconda3\lib\site-packages (from nbclient>=0.5.0->nbconvert) (7.3.4)
Requirement already satisfied: nest-asyncio in c:\users\yasmi\anaconda3\lib\site-packages (from nbclient>=0.5.0->nbconvert) (1.5.6)
Requirement already satisfied: jsonschema>=2.6 in c:\users\yasmi\anaconda3\lib\site-packages (from nbformat>=5.1->nbconvert) (4.17.3)
Requirement already satisfied: fastjsonschema in c:\users\yasmi\anaconda3\lib\site-packages (from nbformat>=5.1->nbconvert) (2.16.2)
Requirement already satisfied: soupsieve>1.2 in c:\users\yasmi\anaconda3\lib\site-packages (from beautifulsoup4->nbconvert) (2.3.2.post1)
Requirement already satisfied: six>=1.9.0 in c:\users\yasmi\anaconda3\lib\site-packages (from bleach->nbconvert) (1.16.0)
Requirement already satisfied: webencodings in c:\users\yasmi\anaconda3\lib\site-packages (from bleach->nbconvert) (0.5.1)
Requirement already satisfied: attrs>=17.4.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (22.1.0)
Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (0.18.0)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-client>=6.1.5->nbclient>=0.5.0->nbconvert) (2.8.2)
Requirement already satisfied: pyzmq>=23.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-client>=6.1.5->nbclient>=0.5.0->nbconvert) (23.2.0)
Requirement already satisfied: tornado>=6.0 in c:\users\yasmi\anaconda3\lib\site-packages (from jupyter-client>=6.1.5->nbclient>=0.5.0->nbconvert) (6.1)
In [ ]: